In [33]:
# To create virtual env -python -m venv .venv
# Activate virtual env - .\.venv\Scripts\activate
# Install dependencies using the requirements.txt
# pip install -r requirements.txt
In [34]:
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
pio.renderers.default = "notebook_connected"
In [35]:
#Load the data
df = pd.read_csv("../data/processed/Data_Science_Challenge.csv")
In [36]:
pd.set_option('display.max_columns', None)
df.head()
Out[36]:
state account length area code phone number international plan voice mail plan number vmail messages total day minutes total day calls total day charge total eve minutes total eve calls total eve charge total night minutes total night calls total night charge total intl minutes total intl calls total intl charge customer service calls churn
0 KS 128 415 382-4657 no yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False
1 OH 107 415 371-7191 no yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False
2 NJ 137 415 358-1921 no no 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False
3 OH 84 408 375-9999 yes no 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False
4 OK 75 415 330-6626 yes no 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False
In [37]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account length          3333 non-null   int64  
 2   area code               3333 non-null   int64  
 3   phone number            3333 non-null   object 
 4   international plan      3333 non-null   object 
 5   voice mail plan         3333 non-null   object 
 6   number vmail messages   3333 non-null   int64  
 7   total day minutes       3333 non-null   float64
 8   total day calls         3333 non-null   int64  
 9   total day charge        3333 non-null   float64
 10  total eve minutes       3333 non-null   float64
 11  total eve calls         3333 non-null   int64  
 12  total eve charge        3333 non-null   float64
 13  total night minutes     3333 non-null   float64
 14  total night calls       3333 non-null   int64  
 15  total night charge      3333 non-null   float64
 16  total intl minutes      3333 non-null   float64
 17  total intl calls        3333 non-null   int64  
 18  total intl charge       3333 non-null   float64
 19  customer service calls  3333 non-null   int64  
 20  churn                   3333 non-null   bool   
dtypes: bool(1), float64(8), int64(8), object(4)
memory usage: 524.2+ KB
In [38]:
df.describe()
Out[38]:
account length area code number vmail messages total day minutes total day calls total day charge total eve minutes total eve calls total eve charge total night minutes total night calls total night charge total intl minutes total intl calls total intl charge customer service calls
count 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000
mean 101.064806 437.182418 8.099010 179.775098 100.435644 30.562307 200.980348 100.114311 17.083540 200.872037 100.107711 9.039325 10.237294 4.479448 2.764581 1.562856
std 39.822106 42.371290 13.688365 54.467389 20.069084 9.259435 50.713844 19.922625 4.310668 50.573847 19.568609 2.275873 2.791840 2.461214 0.753773 1.315491
min 1.000000 408.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 23.200000 33.000000 1.040000 0.000000 0.000000 0.000000 0.000000
25% 74.000000 408.000000 0.000000 143.700000 87.000000 24.430000 166.600000 87.000000 14.160000 167.000000 87.000000 7.520000 8.500000 3.000000 2.300000 1.000000
50% 101.000000 415.000000 0.000000 179.400000 101.000000 30.500000 201.400000 100.000000 17.120000 201.200000 100.000000 9.050000 10.300000 4.000000 2.780000 1.000000
75% 127.000000 510.000000 20.000000 216.400000 114.000000 36.790000 235.300000 114.000000 20.000000 235.300000 113.000000 10.590000 12.100000 6.000000 3.270000 2.000000
max 243.000000 510.000000 51.000000 350.800000 165.000000 59.640000 363.700000 170.000000 30.910000 395.000000 175.000000 17.770000 20.000000 20.000000 5.400000 9.000000
In [39]:
#Check for missing values
df.isnull().sum()
Out[39]:
state                     0
account length            0
area code                 0
phone number              0
international plan        0
voice mail plan           0
number vmail messages     0
total day minutes         0
total day calls           0
total day charge          0
total eve minutes         0
total eve calls           0
total eve charge          0
total night minutes       0
total night calls         0
total night charge        0
total intl minutes        0
total intl calls          0
total intl charge         0
customer service calls    0
churn                     0
dtype: int64
In [40]:
df.duplicated().sum()
Out[40]:
np.int64(0)
In [41]:
df.nunique()
Out[41]:
state                       51
account length             212
area code                    3
phone number              3333
international plan           2
voice mail plan              2
number vmail messages       46
total day minutes         1667
total day calls            119
total day charge          1667
total eve minutes         1611
total eve calls            123
total eve charge          1440
total night minutes       1591
total night calls          120
total night charge         933
total intl minutes         162
total intl calls            21
total intl charge          162
customer service calls      10
churn                        2
dtype: int64
In [42]:
# Exploratory Data Analysis

#Univariate Analysis with histograms

for col in df.select_dtypes(include=['int64', 'float64']).columns:
    fig = px.histogram(df, x=col, nbins=25, title=f'Histogram of {col}')
    fig.update_layout(xaxis_title=col, yaxis_title='Frequency', bargap=0.1)
    fig.show()
In [43]:
'''
Most of the features are normally distributed however there are some features that are skewed.
The features that are right skewed include
1. Total international calls
2. Number of customer service calls
'''
Out[43]:
'\nMost of the features are normally distributed however there are some features that are skewed.\nThe features that are right skewed include\n1. Total international calls\n2. Number of customer service calls\n'
In [44]:
# Using boxplots to identify outliers
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df_melted = df.melt(value_vars=numerical_cols, var_name='Variable', value_name='Value')
fig = px.box(df_melted, x='Variable', y='Value', points='outliers', title='Boxplots for all Variables')
fig.update_layout(
    xaxis_title='Variables',
    yaxis_title='Values',
    xaxis={'tickangle': 45},
    boxmode='group'
)
fig.show()
In [45]:
# Getting counts of outliers

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

outlier_counts = {}

for col in numerical_cols:
    Q1 = df[col].quantile(0.25)  
    Q3 = df[col].quantile(0.75)  
    IQR = Q3 - Q1                
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    
    outlier_counts[col] = len(outliers)

for col, count in outlier_counts.items():
    print(f"Column: {col}, Number of Outliers: {count}")
Column: account length, Number of Outliers: 18
Column: area code, Number of Outliers: 0
Column: number vmail messages, Number of Outliers: 1
Column: total day minutes, Number of Outliers: 25
Column: total day calls, Number of Outliers: 23
Column: total day charge, Number of Outliers: 25
Column: total eve minutes, Number of Outliers: 24
Column: total eve calls, Number of Outliers: 20
Column: total eve charge, Number of Outliers: 24
Column: total night minutes, Number of Outliers: 30
Column: total night calls, Number of Outliers: 22
Column: total night charge, Number of Outliers: 30
Column: total intl minutes, Number of Outliers: 46
Column: total intl calls, Number of Outliers: 78
Column: total intl charge, Number of Outliers: 49
Column: customer service calls, Number of Outliers: 267
In [46]:
# There are quite a few outliers in the variables however, we will deal with them depending on their correlation so we can accordingly transform or remove them.
In [47]:
# Bivariate Analysis and Correlation Matrix
imp_cols = df[numerical_cols].assign(churn=df['churn'])
%matplotlib inline
sns.pairplot(imp_cols, diag_kind='kde', corner=True, hue='churn')
plt.suptitle('Pairwise Plot of Numerical Variables', y=1.02)
plt.show()
No description has been provided for this image
In [48]:
'''
1. We can see that some variables are correlated with each other such as total day charge and total day minutes, total eve charge and total eve minutes, total night charge and total night minutes, total international charge and total international minutes.
2. Some variables are not normal distributed and have multiple peaks possibly suggesting grouping such as number of voicemail messages and customer service calls
'''
Out[48]:
'\n1. We can see that some variables are correlated with each other such as total day charge and total day minutes, total eve charge and total eve minutes, total night charge and total night minutes, total international charge and total international minutes.\n2. Some variables are not normal distributed and have multiple peaks possibly suggesting grouping such as number of voicemail messages and customer service calls\n'
In [49]:
correlation_matrix = df[numerical_cols].corr()

fig = px.imshow(
    correlation_matrix,
    text_auto=".2f",  # Add annotations with 2 decimal places
    color_continuous_scale="RdBu",  # Red-Blue color scale for contrast
    title="Correlation Heatmap",
    labels=dict(color="Correlation"),  # Add a color bar label
)

fig.update_layout(
    title_font_size=18,
    title_x=0.5,  # Center the title
    xaxis=dict(tickangle=45),  # Rotate x-axis labels for readability
    coloraxis_colorbar=dict(
        title="Correlation Coefficient",  # Color bar title
        tickvals=[-1, -0.5, 0, 0.5, 1],  # Define ticks for the color bar
        ticktext=["-1 (Negative)", "-0.5", "0 (None)", "0.5", "1 (Positive)"],  # Label ticks
    ),
    width=900,  # Adjust the width of the heatmap
    height=700,  # Adjust the height
)

fig.show()
In [65]:
# For better readability, we will only show the lower triangle of the correlation matrix
# Step 1: Mask the upper triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))  # Upper triangle mask
correlation_matrix_masked = correlation_matrix.mask(mask)  # Apply the mask

# Step 2: Plot the heatmap
fig = px.imshow(
    correlation_matrix_masked,
    text_auto=".2f",  # Show values with 2 decimal places
    color_continuous_scale="RdBu",
    title="Correlation Heatmap (Lower Triangle Only)",
    labels=dict(color="Correlation"),
)

fig.update_layout(
    title_font_size=18,
    title_x=0.5,  # Center the title
    xaxis=dict(tickangle=45),  # Rotate x-axis labels
    coloraxis_colorbar=dict(
        title="Correlation Coefficient",
        tickvals=[-1, -0.5, 0, 0.5, 1],
        ticktext=["-1 (Negative)", "-0.5", "0 (None)", "0.5", "1 (Positive)"],
    ),
    width=800,
    height=800,
    margin=dict(
        l=170,  # Add left padding
        r=50,   # Add right padding
        t=50,   # Add top padding
        b=150   # Add bottom padding
    ),
)

fig.show()
In [51]:
# The relationship between variables observed in the correlation matrix is consistent with the pairplot.
# However, upon looking at the distributions with churn as a feature, we can see that the dataset is imbalanced. To confirm this we look at the distribution of churn.
In [52]:
churn_distribution = df['churn'].value_counts(normalize=True)
print(churn_distribution)
churn
False    0.855086
True     0.144914
Name: proportion, dtype: float64
In [53]:
# 86% of customers did not churn while 14% of customers churned.
In [54]:
# Using boxplots to compare numerical features by churn
for col in numerical_cols:
    fig = px.box(df, x='churn', y=col, color='churn', 
                 title=f'Distribution of {col} by Churn',
                 labels={'churn': 'Churn', col: col})
    fig.update_traces(quartilemethod="exclusive") 
    fig.show()
In [55]:
'''
1. We can observe that the distribution of customer service calls is more dispersed for customers who churned indicating that had more customer service calls that varied in experience.
2. Customers who churned spent more minutes talking during the day
3. The distribution of number of voicemail messages for customers who churned is essentially flat indicating that most churned customers have zero voicemail messages. It appears that those who don't use voicemail services would be likely to churn.
'''
Out[55]:
"\n1. We can observe that the distribution of customer service calls is more dispersed for customers who churned indicating that had more customer service calls that varied in experience.\n2. Customers who churned spent more minutes talking during the day\n3. The distribution of number of voicemail messages for customers who churned is essentially flat indicating that most churned customers have zero voicemail messages. It appears that those who don't use voicemail services would be likely to churn.\n"
In [56]:
# Running statistical tests on numerical variables

from scipy.stats import ttest_ind

results = {}
for col in numerical_cols:
    churned = df[df['churn'] == True][col]
    not_churned = df[df['churn'] == False][col]
    
    # Perform t-test
    t_stat, p_value = ttest_ind(churned, not_churned, equal_var=False)
    results[col] = {'T-statistic': t_stat, 'P-value': p_value}

# Print results
for col, stats in results.items():
    print(f"{col}:\n\tT-statistic: {stats['T-statistic']:.2f}, P-value: {stats['P-value']:.3f}\n")
account length:
	T-statistic: 0.96, P-value: 0.336

area code:
	T-statistic: 0.35, P-value: 0.724

number vmail messages:
	T-statistic: -5.82, P-value: 0.000

total day minutes:
	T-statistic: 9.68, P-value: 0.000

total day calls:
	T-statistic: 1.00, P-value: 0.317

total day charge:
	T-statistic: 9.68, P-value: 0.000

total eve minutes:
	T-statistic: 5.27, P-value: 0.000

total eve calls:
	T-statistic: 0.54, P-value: 0.591

total eve charge:
	T-statistic: 5.27, P-value: 0.000

total night minutes:
	T-statistic: 2.17, P-value: 0.030

total night calls:
	T-statistic: 0.35, P-value: 0.727

total night charge:
	T-statistic: 2.17, P-value: 0.030

total intl minutes:
	T-statistic: 3.94, P-value: 0.000

total intl calls:
	T-statistic: -2.96, P-value: 0.003

total intl charge:
	T-statistic: 3.94, P-value: 0.000

customer service calls:
	T-statistic: 8.96, P-value: 0.000

In [ ]:
# Keeping the p value threshold as 0.05 we can see a significant association with
'''
1. number vmail messages
2. total day minutes
3. total day charge
4. total eve minutes
5. total eve charge
6. total night minutes
7. total night charge
8. total intl minutes
9. total intl charge
10. customer service calls
'''
# this indicates that churned and non-churned customers have a significantly different average 1-10 ^^
Out[ ]:
''
In [67]:
# We move onto analyse the categorical variables.
In [58]:
categorical_cols = ['state', 'international plan', 'voice mail plan']

for col in categorical_cols:
    fig = px.histogram(df, x=col, title=f'Distribution of {col}', color_discrete_sequence=['black'])
    fig.update_layout(xaxis_title=col, yaxis_title="Count")
    fig.show()
In [59]:
# Majority of the customers did not have an international plan and did not have a voice mail plan.
#Further drilling down using the churned variable.
In [60]:
# Crosstab to analyze churn based on both international plan and voice mail plan
crosstab = pd.crosstab([df['international plan'], df['voice mail plan']], df['churn'])
print(crosstab)
churn                               False  True 
international plan voice mail plan              
no                 no                1878    302
                   yes                786     44
yes                no                 130    101
                   yes                 56     36
In [61]:
# Melt crosstab for visualization
crosstab_reset = crosstab.reset_index()
crosstab_melted = crosstab_reset.melt(id_vars=['international plan', 'voice mail plan'], 
                                      var_name='churn', value_name='count')

# Plot
fig = px.bar(
    crosstab_melted,
    x='international plan',
    y='count',
    color='churn',
    facet_col='voice mail plan',
    barmode='stack',
    title='Churn by International Plan and Voice Mail Plan',
    labels={'count': 'Customer Count', 'churn': 'Churn'},
    color_discrete_map={True: 'red', False: 'blue'}  # Explicitly map colors
)
fig.update_layout(
    showlegend=True,
    xaxis_title="International Plan",
    yaxis_title="Customer Count"
)
fig.show()
In [ ]:
from scipy.stats import chi2_contingency

contingency_table = pd.crosstab([df['international plan'], df['voice mail plan']], df['churn'])
print(contingency_table)

chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square Statistic: {chi2:.2f}, p-value: {p:.3f}")
churn                               False  True 
international plan voice mail plan              
no                 no                1878    302
                   yes                786     44
yes                no                 130    101
                   yes                 56     36
Chi-square Statistic: 261.65, p-value: 0.000
In [ ]:
# Calculating churn rate
contingency_table['Total'] = contingency_table[False] + contingency_table[True]

contingency_table['Churn Rate'] = contingency_table[True] / contingency_table['Total']

print(contingency_table)
churn                               False  True  Total  Churn Rate
international plan voice mail plan                                
no                 no                1878   302   2180    0.138532
                   yes                786    44    830    0.053012
yes                no                 130   101    231    0.437229
                   yes                 56    36     92    0.391304
In [71]:
'''
We can see that customers with an international plan are more likely to churn than the ones without an international plan and the international plan variable seems to have more impact compared to the voice mail plan.
'''
Out[71]:
'\nWe can see that customers with an international plan are more likely to churn than the ones without an international plan and the international plan variable seems to have more impact compared to the voice mail plan.\n'
In [ ]:
'''
From EDA we can confirm that the variables influencing churn are:
1. number vmail messages
2. total day minutes
3. total day charge
4. total eve minutes
5. total eve charge
6. total night minutes
7. total night charge
8. total intl minutes
9. total intl charge
10. customer service calls
11. international plan
12. voice mail plan
'''
In [ ]: